# Supplemental Online Resources Improve Political Methods Education
# Marco Alcocer, Leonardo Falabella, Alexandra Lange, Nicholas Smith, and
# Maureen Feeley
# Replication code (Appendix)

# Packages required -------------------------------------------------------

library(AER)
library(here)
library(tidyverse)
library(sandwich)
library(lmtest)
library(stargazer)
library(gridExtra)
library(vtable)
library(future)

# We use the ivDiag package to calculate cluster-robust F statistics for the
# instrumental variable regressions. The package is not yet available on CRAN as
# of April 30, 2022 (it can be found at https://github.com/apoorvalal/ivDiag.)
# To download and install the package:
library(remotes)
install_github("apoorvalal/ivDiag")
library(ivDiag)

# Read data ---------------------------------------------------------------

dfd <- as_tibble(readRDS(here("replication", 
                              "data.rds")))

# cluster_se function -----------------------------------------------------

# Create cluster_se function to extract standard errors clustered by student for
# each regression model
cluster_se <- function(x) 
  coeftest(x, vcov = vcovCL, cluster = ~ DeID)[, "Std. Error"]

# Wrangle data from student-question to student level --------------------

# Data frame will be used for to generate descriptive plots and tables in the
# appendix.
pdat <- dfd %>%
  group_by(DeID) %>%
  summarise(
    views = sum(n_views),
    quizzes = sum(quizzes_taken),
    gpa = mean(gpa, na.rm = TRUE),
    mean_score = mean(pct_score)
  ) %>%
  # Did the student view at least one page? Take at least one quiz?
  mutate(viewed_page = factor(ifelse(views > 0, "Yes", "No"),
                              levels = c("Yes", "No")),
         took_quiz = factor(ifelse(quizzes > 0, "Yes", "No"),
                            levels = c("Yes", "No"))) %>% 
  left_join(
    dfd %>%
      distinct(DeID, .keep_all = TRUE) %>%
      select(DeID, urm),
    by = "DeID"
  )

# Appendix Figure 1: Randomization Inference ------------------------------

# Subset variables in main dataset
dfdri <- dfd %>% 
  select(DeID, treatment_group, 
         question, treatment, pct_score)

# Read dataset with correspondences between learning modules, treatment groups,
# and questions (as in the Pre-Analysis Plan)
mtq <- readRDS(here::here("replication",
                          "module_treatgroup_question.rds"))

# Create a data frame informing which questions are treated for each treatment
# group (based upon mtq)
# Necessary to simulate placebo statuses
tqs <- dfdri %>% distinct(treatment_group, question)
for (i in 1:nrow(tqs)) {
  tqs$treated[i] <-
    ifelse(sum(mtq[which(mtq[, paste0("tg",
                                      tqs[i, ]$treatment_group)] == 1),
                   tqs[i, ]$question]) > 0,
           1, 0)
}

# Set seed
set.seed(220419)
# Set number of simulations
nsims <- 5000
# Placeholder list for placebo statuses
placebo_statuses <- vector(mode = "list", length = nsims)
# Placeholder list for regression models
randinf_models <- vector(mode = "list", length = nsims)
# Placeholder vector for placebo coefficients
placebo_coefs <- rep(NA, nsims)

# Simulate placebo statuses, run regressions, and retrieve coefficients
for(i in 1:nsims) {
  # Generate list containing dataframes with different placebo statuses
  placebo_statuses[[i]] <- dfdri %>% distinct(DeID) %>%
    mutate(placebo_group = sample(1:6,
                                  nrow(dfdri %>% distinct(DeID)),
                                  replace = TRUE)) %>%
    right_join(dfdri, by = "DeID") %>%
    left_join((tqs %>% rename(placebo_group = treatment_group)),
              by = c("placebo_group", "question")) %>%
    rename(placebo = treated)
  # Run regressions with placebo treatments
  randinf_models[[i]] <- lm(
    pct_score ~ placebo +
      factor(DeID) +
      factor(question),
    data = placebo_statuses[[i]]
  )
  # Retrieve coefficients for placebo
  placebo_coefs[i] <- randinf_models[[i]]$coefficients["placebo"]
}

# Remove objects
rm(randinf_models, placebo_statuses, 
   mtq, tqs)

# Obtain observed coefficient (under observed treatment status)
observed_coef <- lm(pct_score ~ treatment + 
                      factor(DeID) + 
                      factor(question), 
                    data = dfdri)$coefficients[2]

# P-value for sharp null test
sum(placebo_coefs > observed_coef)/nsims

# Histogram with placebo coefficients (n = nsims); vertical line depicting the
# observed coefficient
riplot <- ggplot() +
  geom_histogram(
    aes(placebo_coefs),
    color = "black",
    fill = "white",
    breaks = seq(-5, 5, 0.5)
  ) +
  geom_vline(aes(xintercept = observed_coef),
             linetype = "dashed") +
  labs(
    x = "Coefficients",
    y = "Frequency",
    title = expression(paste("Coefficients for ",
                             italic(Placebo[iq]))),
    subtitle = expression(
      paste(
        "Vertical line depicting the observed coefficient for ",
        italic(Treatment[iq])
      )
    )
  ) +
  scale_x_continuous(breaks = -5:5) +
  scale_y_continuous(breaks = seq(0, 750, 250)) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 14),
    plot.subtitle = element_text(size = 10),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    strip.text.x = element_text(size = 12)
  )

# Save plot
ggsave(here("replication",
            "figures",
            "randomization_inference.pdf"),
       plot = riplot,
       width = 13.5,
       height = 7.25,
       units = "cm")

# Remove objects
rm(dfdri, riplot, observed_coef, placebo_coefs, i, nsims)

# Appendix Figure 2: Module use distribution ------------------------------

# Page views, histogram
hview <- ggplot(pdat, aes(x = views)) +
  geom_histogram(binwidth = 10,
                 boundary = 0, 
                 closed = "left",
                 color = "black",
                 fill = "white") +
  scale_x_continuous(breaks = seq(0, 140, 20)) +
  scale_y_continuous(breaks = seq(0, 40, 5)) +
  geom_vline(xintercept = median(pdat$views),
             linetype = "dashed") +
  theme_bw() +
  labs(x = "Page views",
       y = "Frequency")

# Quiz completions, histogram
hquiz <- ggplot(pdat, aes(x = quizzes)) +
  geom_histogram(binwidth = 1,
                 color = "black",
                 fill = "white") +
  scale_x_continuous(breaks = 0:8) +
  scale_y_continuous(breaks = seq(0, 70, 10)) +
  geom_vline(xintercept = median(pdat$quizzes),
             linetype = "dashed") +
  theme_bw() +
  labs(x = "Quiz completions",
       y = "Frequency")

# Arrange, save
huse <- grid.arrange(hview, hquiz, ncol = 2)
ggsave(here("replication",
            "figures",
            "hist_use.pdf"),
       plot = huse,
       width = 13.5,
       height = 7.25,
       units = "cm")

# Remove plot objects
rm(hview, hquiz, huse)

# Appendix Table 1: Compliance to treatment, first stage ------------------

# Estimate regression models
ivfs_mdls <- list(
  # IV, first stage, Model 1: viewed a page, question FE
  ivfs_m1 <- lm(viewed_page ~ treatment +
                  factor(DeID) +
                  factor(question),
                data = dfd),
  # IV, first stage, Model 2: viewed a page, no question FE
  ivfs_m2 <- lm(viewed_page ~ treatment +
                  factor(DeID),
                data = dfd),
  # IV, first stage, Model 3: completed a quiz, question FE
  ivfs_m3 <- lm(took_quiz ~ treatment +
                  factor(DeID) +
                  factor(question),
                data = dfd),
  # IV, first stage, Model 4: completed a quiz, no question FE
  ivfs_m4 <- lm(took_quiz ~ treatment +
                  factor(DeID),
                data = dfd)
)

# Estimate first stage cluster robust F statistics
dfd_ivf <-
  data.frame(
    pct_score = dfd$pct_score,
    viewed_page = dfd$viewed_page,
    took_quiz = dfd$took_quiz,
    treatment = dfd$treatment,
    DeID = dfd$DeID,
    question = dfd$question
  )

Y <-  "pct_score"
D1 <- "viewed_page"
D2 <- "took_quiz"
Z <- "treatment"
cl <- "DeID"
FE <- c("DeID", "question")
FE2 <- c("DeID")

# IV, first stage, Model 1: viewed a page, question FE
fst1 = boot_IV(
  Y = Y,
  D = D1,
  Z = Z,
  FE = FE,
  cl = cl,
  nboot = 1000,
  data = dfd_ivf
)$F_stat[3]

# IV, first stage, Model 3: completed a quiz, question FE
fst3 = boot_IV(
  Y = Y,
  D = D2,
  Z = Z,
  FE = FE,
  cl = cl,
  nboot = 1000,
  data = dfd_ivf
)$F_stat[3]

# IV, first stage, Model 2: viewed a page, no question FE
fst2 = boot_IV(
  Y = Y,
  D = D1,
  Z = Z,
  FE = FE2,
  cl = cl,
  nboot = 1000,
  data = dfd_ivf
)$F_stat[3]

# IV, first stage, Model 4: completed a quiz, no question FE
fst4 = boot_IV(
  Y = Y,
  D = D2,
  Z = Z,
  FE = FE2,
  cl = cl,
  nboot = 1000,
  data = dfd_ivf
)$F_stat[3]

# Produce regression table
stargazer(
  ivfs_mdls,
  header = F,
  font.size = "small",
  df = F,
  keep = "treatment",
  se = lapply(ivfs_mdls,
              cluster_se),
  add.lines = list(c("Cluster-robust IV F-Stat", fst1, fst2, fst3, fst4),
                   c("Student FE", rep("Yes", 4)),
                   c("Question FE", rep(c("Yes", "No"), 2))),
  column.labels   = c("Viewed a page", "Completed a quiz"),
  column.separate = c(2, 2),
  dep.var.caption = "Compliance",
  omit.stat = c("adj.rsq", "ser","f"),
  dep.var.labels.include = FALSE,
  covariate.labels = "Treatment (OSI available)",
  model.names = FALSE,
  title = "Compliance to treatment, first stage.",
  label = "table:first-stage",
  notes = "Standard errors clustered by student in all columns.",
  type = "latex",
  out = here("replication", 
             "tables",
             "iv_first_stage.tex")
)

# Remove objects
rm(ivfs_m1, ivfs_m2, ivfs_m3, ivfs_m4, 
   fst1, fst2, fst3, fst4, ivfs_mdls, dfd_ivf,
   Y, D1, D2, Z, cl, FE, FE2)

# Appendix Table 2: Student-level summary statistics ----------------------

sumtable(
  pdat[, c("viewed_page",
           "took_quiz",
           "urm")],
  labels = c("Viewed at least one page",
             "Completed at least one quiz",
             "URM Status"),
  out = "csv",
  file = here("replication",
              "tables",
              "summary_student_dem.csv")
)

# Remove student-level data frame
rm(pdat)

# Appendix Table 3: Summary statistics for question scores ----------------

write.csv(
  dfd %>% 
    group_by(question) %>% 
    summarise(
      n = n(),
      mean = mean(pct_score),
      median = median(pct_score),
      sd = sd(pct_score),
      min = min(pct_score),
      pctl25 = quantile(pct_score, .25),
      pctl75 = quantile(pct_score, .75),
      max = max(pct_score)
    ),
  here("replication",
       "tables",
       "summary_question.csv"),
  row.names = FALSE
)

# Appendix Table 4: Robustness check (drop q3a, 3b, OLS-ITT) --------------

# Checking if the results hold after dropping the questions graded by one of the
# co-authors of the study/supplemental resources.

# Subset data
drop_q3 <- dfd %>% filter(qgroup != 3)

# Estimate regression models
mr_dq3_mdls <- list(
  # Model 1: OLS, percent score, access to treatment
  mr_dq3_m1 <- lm(pct_score ~ treatment +
                factor(DeID) +
                factor(question),
              data = drop_q3), 
  # Model 2: IV-2SLS, percent score, page view
  mr_dq3_m2 <- ivreg(
    pct_score ~ viewed_page +
      factor(DeID) +
      factor(question) |
      treatment +
      factor(DeID) +
      factor(question),
    data = drop_q3
  ), 
  # Model 3: IV-2SLS, percent score, quiz completion
  mr_dq3_m3 <- ivreg(
    pct_score ~ took_quiz +
      factor(DeID) +
      factor(question) |
      treatment +
      factor(DeID) +
      factor(question),
    data = drop_q3
  ), 
  # Model 4: OLS, standardized score, access to treatment
  mr_dq3_m4 <- lm(std_score ~ treatment +
                factor(DeID),
              data = drop_q3),
  # Model 5: IV-2SLS, standardized score, page view
  mr_dq3_m5 <- ivreg(std_score ~ viewed_page +
                   factor(DeID) |
                   treatment +
                   factor(DeID),
                 data = drop_q3),
  # Model 6: IV-2SLS, standardized score, quiz completion
  mr_dq3_m6 <- ivreg(std_score ~ took_quiz +
                   factor(DeID) |
                   treatment +
                   factor(DeID),
                 data = drop_q3)
)

# Generate table
stargazer(
  mr_dq3_mdls,
  header = F,
  font.size = "small",
  df = F,
  keep = c("treatment", "viewed_page", "took_quiz"),
  # List apply cluster_se function for clustered standard errors
  se = lapply(mr_dq3_mdls,
              cluster_se),
  add.lines = list(
    c("Model", rep(c(
      "OLS", rep("IV-2SLS", 2)
    ), 2)),
    c("Student FE", rep("Yes", 6)),
    c("Question FE", rep("Yes", 3), rep("No", 3))
  ),
  column.labels   = c("Percent", "Standardized"),
  column.separate = c(3, 3),
  dep.var.caption = "Question Score",
  omit.stat = c("f", "adj.rsq", "ser"),
  dep.var.labels.include = FALSE,
  covariate.labels = c(
    "Treatment (OSI available)",
    "Compliance (viewed a page)",
    "Compliance (completed a quiz)"
  ),
  column.sep.width = "1pt",
  model.names = FALSE,
  title = "Robustness check: Main results with a subset excluding questions 3a and 3b (Confounding and Intervening Variables), which were graded by a co-author of the supplemental online resources. The co-author/grader had access to assignment to treatment by student, but could not see students' names while grading since student names were omitted by the Gradescope application. All remaining questions were graded by TAs without access to the resources.",
  label = "table:main_dq3",
  notes = "Standard errors clustered by student in all columns.",
  type = "latex",
  out = here(
    "replication",
    "tables",
    "main_results_dq3_table.tex"
  )
)

# Remove objects
rm(mr_dq3_m1, mr_dq3_m2, mr_dq3_m3,
   mr_dq3_m4, mr_dq3_m5, mr_dq3_m6,
   mr_dq3_mdls, drop_q3)

# Appendix Table 5: Include student covariates ----------------------------

# Running main results with student-specific covariates after dropping the
# observations withmissing values (182 observations missing either URM or GPA
# values)

# Estimate regression models
mr_covs_mdls <- list(
  # Model 1: OLS, percent score, access to treatment
  mr_covs_m1 <- lm(
    pct_score ~ treatment + gpa + urm +
      factor(DeID) +
      factor(question),
    data = dfd
  ),
  # Model 2: IV-2SLS, percent score, page view
  mr_covs_m2 <- ivreg(
    pct_score ~ viewed_page + gpa + urm +
      factor(DeID) +
      factor(question) |
      treatment + gpa + urm +
      factor(DeID) +
      factor(question),
    data = dfd
  ),
  # Model 3: IV-2SLS, percent score, quiz completion
  mr_covs_m3 <- ivreg(
    pct_score ~ took_quiz + gpa + urm +
      factor(DeID) +
      factor(question) |
      treatment + gpa + urm +
      factor(DeID) +
      factor(question),
    data = dfd
  ),
  # Model 4: OLS, standardized score, access to treatment
  mr_covs_m4 <- lm(std_score ~ treatment + gpa + urm +
                     factor(DeID),
                   data = dfd),
  # Model 5: IV-2SLS, standardized score, page view
  mr_covs_m5 <- ivreg(
    std_score ~ viewed_page + gpa + urm +
      factor(DeID) |
      treatment + gpa + urm +
      factor(DeID),
    data = dfd
  ),
  # Model 6: IV-2SLS, standardized score, quiz completion
  mr_covs_m6 <- ivreg(
    std_score ~ took_quiz +
      factor(DeID) |
      treatment + gpa + urm +
      factor(DeID),
    data = dfd
  )
)

# Generate main results table -- LaTeX
stargazer(
  mr_covs_mdls,
  header = F,
  font.size = "small",
  df = F,
  keep = c("treatment", "viewed_page", "took_quiz"),
  # List apply cluster_se function for clustered standard errors
  se = lapply(mr_covs_mdls,
              cluster_se),
  add.lines = list(
    c("Model", rep(c(
      "OLS", rep("IV-2SLS", 2)
    ), 2)),
    c("Student Covariates", rep("Yes", 6)),
    c("Student FE", rep("Yes", 6)),
    c("Question FE", rep("Yes", 3), rep("No", 3))
  ),
  column.labels   = c("Percent", "Standardized"),
  column.separate = c(3, 3),
  dep.var.caption = "Question Score",
  omit.stat = c("f", "adj.rsq", "ser"),
  dep.var.labels.include = FALSE,
  covariate.labels = c(
    "Treatment (OSI available)",
    "Compliance (viewed a page)",
    "Compliance (completed a quiz)"
  ),
  column.sep.width = "1pt",
  model.names = FALSE,
  title = "Main results including student-specific covariates (GPA and URM) and dropping observations with missing covariate values.",
  label = "table:main_covs",
  notes = "Standard errors clustered by student in all columns.",
  type = "latex",
  out = here("replication",
             "tables",
             "main_results_covs_table.tex")
)

# Remove objects
<<<<<<< HEAD:replication/soripme_code_appendix.R
rm(mr_covs_m1,
   mr_covs_m2,
   mr_covs_m3,
   mr_covs_m4,
   mr_covs_m5,
   mr_covs_m6,
   mr_covs_mdls,
   cluster_se,
   dfd)
=======
rm(mr_covs_m1, mr_covs_m2, mr_covs_m3,
   mr_covs_m4, mr_covs_m5, mr_covs_m6,
   mr_covs_mdls)

## Randomization inference ------------------------------------------------

# Subset variables in main dataset
dfd <- dfd %>% 
  select(DeID, treatment_group, 
         question, treatment, pct_score)

# Read dataset with correspondences between learning modules, treatment groups,
# and questions (as in the Pre-Analysis Plan)
mtq <- readRDS(here::here("replication",
                          "module_treatgroup_question.rds"))

# Create a data frame informing which questions are treated for each treatment
# group (based upon mtq)
# Necessary to simulate placebo statuses
tqs <- dfd %>% distinct(treatment_group, question)
for (i in 1:nrow(tqs)) {
  tqs$treated[i] <-
    ifelse(sum(mtq[which(mtq[, paste0("tg",
                                      tqs[i, ]$treatment_group)] == 1),
                   tqs[i, ]$question]) > 0,
           1, 0)
}

# Set seed
set.seed(220419)
# Set number of simulations
nsims <- 5000
# Placeholder list for placebo statuses
placebo_statuses <- vector(mode = "list", length = nsims)
# Placeholder list for regression models
randinf_models <- vector(mode = "list", length = nsims)
# Placeholder vector for placebo coefficients
placebo_coefs <- rep(NA, nsims)

# Simulate placebo statuses, run regressions, and retrieve coefficients
for(i in 1:nsims) {
  # Generate list containing dataframes with different placebo statuses
  placebo_statuses[[i]] <- dfd %>% distinct(DeID) %>%
    mutate(placebo_group = sample(1:6,
                                  nrow(dfd %>% distinct(DeID)),
                                  replace = TRUE)) %>%
    right_join(dfd, by = "DeID") %>%
    left_join((tqs %>% rename(placebo_group = treatment_group)),
              by = c("placebo_group", "question")) %>%
    rename(placebo = treated)
  # Run regressions with placebo treatments
  randinf_models[[i]] <- lm(
    pct_score ~ placebo +
      factor(DeID) +
      factor(question),
    data = placebo_statuses[[i]]
  )
  # Retrieve coefficients for placebo
  placebo_coefs[i] <- randinf_models[[i]]$coefficients["placebo"]
}

# Remove objects
rm(randinf_models, placebo_statuses, 
   mtq, tqs)

# Obtain observed coefficient (under observed treatment status)
observed_coef <- lm(pct_score ~ treatment + 
                      factor(DeID) + 
                      factor(question), 
                    data = dfd)$coefficients[2]

# P-value for sharp null test
sum(placebo_coefs > observed_coef)/nsims

# Histogram with placebo coefficients (n = nsims); vertical line depicting the
# observed coefficient
riplot <- ggplot() +
  geom_histogram(
    aes(placebo_coefs),
    color = "black",
    fill = "white",
    #breaks = seq(-5, 5, 0.5),
    bins = nsims
  ) +
  geom_vline(aes(xintercept = observed_coef),
             linetype = "dashed", color = "red") +
  labs(
    x = "Coefficients",
    y = "Frequency",
    title = expression(paste("Coefficients for ",
                             italic(Placebo[iq]))),
    subtitle = expression(
      paste(
        "Vertical line depicting the observed coefficient for ",
        italic(Treatment[iq])
      )
    )
  ) +
  scale_x_continuous(breaks = -5:5) +
  #scale_y_continuous(breaks = seq(0, 750, 25)) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 14),
    plot.subtitle = element_text(size = 10),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    strip.text.x = element_text(size = 12)
  )

# Save plot
ggsave(here("replication",
            "figures",
            "randomization_inference.pdf"),
       plot = riplot,
       width = 13.5,
       height = 7.25,
       units = "cm")
>>>>>>> ac8d7f9208282e6d9044904ab0be68011dc13b2a:replication/soripme_code.R
